#!/usr/local/bin/python

"""
Author: Shop Mallick
Contact: shop.mallick@gmail.com
Date: Jun 1 2020

Usage: zcat mtDB.gz | python mtdna_uncompress_v2.py [options]

Example:
(a) Extract all mitogenomes:
zcat mtDB.gz | python mtdna_uncompress_v2.py

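(b) Extract a specific single mitogenome, with id 'I0000':
zcat mtDB.gz | python mtdna_uncompress_v2.py -i I0000

(c) Extract a set of mitogenomes with ids in the list 'I0000', 'I000a', 'I000b':
zcat mtDB.gz | python mtdna_uncompress_v2.py -i I0000,I000a,I000b

(d) Extract a set of mitogenomes with ids in the file 'myfile.ids' (one id per line):
zcat mtDB.gz | python mtdna_uncompress_v2.py -f myfile.ids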


Notes: uses compression scheme developed by Nick Patterson in cTools from the SGDP project.


Update:
[2021_08_Feb]: data structure simplified, empty sequences stripped

"""

import sys, os
from optparse import OptionParser

# Command-line interface.
#   -i/--id    comma-separated ids to extract
#   -f/--file  file with one id per line; ids from both sources are combined
#   -c/--check only report which of the selected ids are present in the input
# With neither -i nor -f, every record read from stdin is selected.
usage = "usage: zcat mtDB.gz | python %prog  [options] "
parser = OptionParser(usage=usage, version="%prog v2")
parser.add_option(
    "-i", "--id", action="store", type="string", dest="idset", default=False,
    help="extract mitogenomes for a comma-separated set of ids")
parser.add_option(
    "-f", "--file", action="store", type="string", dest="idfile", default=False,
    help="extract the mitogenomes whose ids are listed in this file, one id "
         "per line (combined with the ids given via --id, if any)")
parser.add_option(
    "-c", "--check", action="store_true", dest="check", default=False,
    help="only check which of the selected ids are present: print "
         "'found: ID' for each match instead of extracting its sequence")
(options, args) = parser.parse_args()

# Tell the user (on stderr, so stdout stays clean) how many comma-separated
# ids were passed via -i/--id.
if options.idset:
    nids = options.idset.count(",") + 1
    announcement = "# looking for %i ID(s): idset=%s\n" % (nids, options.idset)
    sys.stderr.write(announcement)

# Reference sequence against which the packed records read from stdin are
# expanded: later in this script every 'Q' in a record is replaced by the
# character found at the same position in this string, and records shorter
# than this string are padded with '-'.
# The literal is laid out in fixed-width rows for readability only; the
# trailing .replace() strips the newlines so the value is a single contiguous
# string. Note that it contains a few 'N' characters.
# NOTE(review): the name suggests this is the RSRS (Reconstructed Sapiens
# Reference Sequence) human mitogenome, presumably 16,569 bases long --
# confirm the source and version.
rsrs = """
GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT
TTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTG
GAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCCCATC
CCATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACCTACTA
AAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTAAAT
GTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAATTTCCACCA
AACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCA
AACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCAGATTTCAAATTT
TATCTTTTGGCGGTATGCACTTTTAACAGTCACCCCCCAACTAACACATT
ATTTTCCCCTCCCACTCCCATACTACTAATCTCATCAATACAACCCCCGC
CCATCCTACCCAGCACACACACNNCGCTGCTAACCCCATACCCCGAACCA
ACCAAACCCCAAAGACACCCCCCACAGTTTATGTAGCTTACCTCCTCAAA
GCAATACACTGAAAATGTTTAGACGGGCTCACATCACCCCATAAACAAAT
AGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTAAGATTACACATGCAA
GCATCCCCGTTCCAGTGAGTTCACCCTCTAAATCACCACGATCAAAAGGG
ACAAGCATCAAGCACGCAACAATGCAGCTCAAAACGCTTAGCCTAGCCAC
ACCCCCACGGGAAACAGCAGTGATAAACCTTTAGCAATAAACGAAAGTTT
AACTAAGCTATACTAACCCCAGGGTTGGTCAATTTCGTGCCAGCCACCGC
GGTCACACGATTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTTTA
GATCACCCCCTCCCCAATAAAGCTAAAACTCACCTGAGTTGTAAAAAACT
CCAGTTGACACAAAATAAACTACGAAAGTGGCTTTAACATATCTGAACAC
ACAATAGCTAAGACCCAAACTGGGATTAGATACCCCACTATGCTTAGCCC
TAAACCTCAACAGTTAAATCAACAAAACTGCTCGCCAGAACACTACGAGC
CACAGCTTAAAACTCAAAGGACCTGGCGGTGCTTCATATCCCTCTAGAGG
AGCCTGTTCTGTAATCGATAAACCCCGATCAACCTCACCACCTCTTGCTC
AGCCTATATACCGCCATCTTCAGCAAACCCTGATGAAGGCTACAAAGTAA
GCGCAAGTACCCACGTAAAGACGTTAGGTCAAGGTGTAGCCCATGAGGTG
GCAAGAAATGGGCTACATTTTCTACCCCAGAAAACTACGATAGCCCTTAT
GAAACTTAAGGGTCGAAGGTGGATTTAGCAGTAAACTGAGAGTAGAGTGC
TTAGTTGAACAGGGCCCTGAAGCGCGTACACACCGCCCGTCACCCTCCTC
AAGTATACTTCAAAGGACATTTAACTAAAACCCCTACGCATTTATATAGA
GGAGACAAGTCGTAACATGGTAAGTGTACTGGAAAGTGCACTTGGACGAA
CCAGAGTGTAGCTTAACACAAAGCACCCAACTTACACTTAGGAGATTTCA
ACTTAACTTGACCGCTCTGAGCTAAACCTAGCCCCAAACCCACTCCACCT
TACTACCAGACAACCTTAGCCAAACCATTTACCCAAATAAAGTATAGGCG
ATAGAAATTGAAACCTGGCGCAATAGATATAGTACCGCAAGGGAAAGATG
AAAAATTATAACCAAGCATAATATAGCAAGGACTAACCCCTATACCTTCT
GCATAATGAATTAACTAGAAATAACTTTGCAAGGAGAGCCAAAGCTAAGA
CCCCCGAAACCAGACGAGCTACCTAAGAACAGCTAAAAGAGCACACCCGT
CTATGTAGCAAAATAGTGGGAAGATTTATAGGTAGAGGCGACAAACCTAC
CGAGCCTGGTGATAGCTGGTTGTCCAAGATAGAATCTTAGTTCAACTTTA
AATTTGCCCACAGAACCCTCTAAATCCCCTTGTAAATTTAACTGTTAGTC
CAAAGAGGAACAGCTCTTTGGACACTAGGAAAAAACCTTGTAGAGAGAGT
AAAAAATTTAACACCCATAGTAGGCCTAAAAGCAGCCACCAATTAAGAAA
GCGTTCAAGCTCAACACCCACTACCTAAAAAATCCCAAACATATAACTGA
ACTCCTCACACCCAATTGGACCAATCTATCACCCTATAGAAGAACTAATG
TTAGTATAAGTAACATGAAAACATTCTCCTCCGCATAAGCCTGCGTCAGA
TTAAAACACTGAACTGACAATTAACAGCCCAATATCTACAATCAACCAAC
AAGTCATTATTACCCTCACTGTCAACCCAACACAGGCATGCTCATAAGGA
AAGGTTAAAAAAAGTAAAAGGAACTCGGCAAATCTTACCCCGCCTGTTTA
CCAAAAACATCACCTCTAGCATCACCAGTATTAGAGGCACCGCCTGCCCA
GTGACACATGTTTAACGGCCGCGGTACCCTAACCGTGCAAAGGTAGCATA
ATCACTTGTTCCTTAAATAGGGACCTGTATGAATGGCTCCACGAGGGTTC
AGCTGTCTCTTACTTTTAACCAGTGAAATTGACCTGCCCGTGAAGAGGCG
GGCATGACACAGCAAGACGAGAAGACCCTATGGAGCTTTAATTTATTAAT
GCAAACAATACCTAACAAACCCACAGGTCCTAAACTACCAAACCTGCATT
AAAAATTTCGGTTGGGGCGACCTCGGAGCAGAACCCAACCTCCGAGCAGT
ACATGCTAAGACTTCACCAGTCAAAGCGAACTACCATACTCAATTGATCC
AATAACTTGACCAACGGAACAAGTTACCCTAGGGATAACAGCGCAATCCT
ATTCTAGAGTCCATATCAACAATAGGGTTTACGACCTCGATGTTGGATCA
GGACATCCCGATGGTGCAGCCGCTATTAAAGGTTCGTTTGTTCAACGATT
AAAGTCCTACGTGATCTGAGTTCAGACCGGAGTAATCCAGGTCGGTTTCT
ATCTACNTTCAAATTCCTCCCTGTACGAAAGGACAAGAGAAATAAGGCCT
ACTTCACAAAGCGCCTTCCCCCGTAAATGATATCATCTCAACTTAGTATT
ATACCCACACCCACCCAAGAACAGGGTTTGTTAAGATGGCAGAGCCCGGT
AATCGCATAAAACTTAAAACTTTACAGTCAGAGGTTCAATTCCTCTTCTT
AACAACATACCCATGGCCAACCTCCTACTCCTCATTGTACCCATTCTAAT
CGCAATGGCATTCCTAATGCTTACCGAACGAAAAATTCTAGGCTATATAC
AACTACGCAAAGGCCCCAACGTTGTAGGCCCCTACGGGCTACTACAACCC
TTCGCTGACGCCATAAAACTCTTCACCAAAGAGCCCCTAAAACCCGCCAC
ATCTACCATCACCCTCTACATCACCGCCCCGACCTTAGCTCTCACCATCG
CTCTTCTACTATGAACCCCCCTCCCCATACCCAACCCCCTGGTTAACCTC
AACCTAGGCCTCCTATTTATTCTAGCCACCTCTAGCCTAGCCGTTTACTC
AATCCTCTGATCAGGGTGAGCATCAAACTCAAACTACGCCCTGATCGGCG
CACTGCGAGCAGTAGCCCAAACAATCTCATATGAAGTCACCCTAGCCATC
ATTCTACTATCAACATTACTAATAAGTGGCTCCTTTAACCTCTCCACCCT
TATCACAACACAAGAACACCTCTGATTACTCCTGCCATCATGACCCTTGG
CCATAATATGATTTATCTCCACACTAGCAGAGACCAACCGAACCCCCTTC
GACCTTGCCGAAGGGGAGTCCGAACTAGTCTCAGGCTTCAACATCGAATA
CGCCGCAGGCCCCTTCGCCCTATTCTTCATAGCCGAATACACAAACATTA
TTATAATAAACACCCTCACCACTACAATCTTCCTAGGAACAACATATGAC
GCACTCTCCCCTGAACTCTACACAACATATTTTGTCACCAAGACCCTACT
TCTGACCTCCCTGTTCTTATGAATTCGAACAGCATACCCCCGATTCCGCT
ACGACCAACTCATACACCTCCTATGAAAAAACTTCCTACCACTCACCCTA
GCATTACTTATATGATATGTCTCCATACCCATTACAATCTCCAGCATTCC
CCCTCAAACCTAAGAAATATGTCTGATAAAAGAGTTACTTTGATAGAGTA
AATAATAGGAGTTTAAACCCCCTTATTTCTAGGACTATGAGAATCGAACC
CATCCCTGAGAATCCAAAATTCTCCGTGCCACCTATCACACCCCATCCTA
AAGTAAGGTCAGCTAAATAAGCTATCGGGCCCATACCCCGAAAATGTTGG
TTATACCCTTCCCGTACTAATTAATCCCCTGGCCCAACCCGTCATCTACT
CTACCATCTTTGCAGGCACACTCATCACAGCGCTAAGCTCGCACTGATTT
TTTACCTGAGTAGGCCTAGAAATAAACATGCTAGCTTTTATTCCAGTTCT
AACCAAAAAAATAAACCCTCGTTCCACAGAAGCTGCCATCAAGTATTTCC
TCACGCAAGCAACCGCATCCATAATCCTTCTAATAGCTATCCTCTTCAAC
AATATACTCTCCGGACAATGAACCATAACCAATACTACCAATCAATACTC
ATCATTAATAATCATAATGGCTATAGCAATAAAACTAGGAATAGCCCCCT
TTCACTTCTGAGTCCCAGAGGTTACCCAAGGCACCCCTCTGACATCCGGC
CTGCTTCTTCTCACATGACAAAAACTAGCCCCCATCTCAATCATATACCA
AATCTCTCCCTCACTAAACGTAAGCCTTCTCCTCACTCTCTCAATCTTAT
CCATCATAGCAGGCAGTTGAGGTGGATTAAACCAAACCCAGCTACGCAAA
ATCTTAGCATACTCCTCAATTACCCACATAGGATGAATAATAGCAGTTCT
ACCGTACAACCCTAACATAACCATTCTTAATTTAACTATTTATATTATCC
TAACTACTACCGCATTCCTACTACTCAACTTAAACTCCAGCACCACGACC
CTACTACTATCTCGCACCTGAAACAAGCTAACATGACTAACACCCTTAAT
TCCATCCACCCTCCTCTCCCTAGGAGGCCTGCCCCCGCTAACCGGCTTTT
TGCCCAAATGGGCCATTATCGAAGAATTCACAAAAAACAATAGCCTCATC
ATCCCCACCATCATAGCCACCATCACCCTCCTTAACCTCTACTTCTACCT
ACGCCTAATCTACTCCACCTCAATCACACTACTCCCCATATCTAACAACG
TAAAAATAAAATGACAGTTTGAACATACAAAACCCACCCCATTCCTCCCC
ACACTCATCGCCCTTACCACGCTACTCCTACCTATCTCCCCTTTTATACT
AATAATCTTATAGAAATTTAGGTTAAATACAGACCAAGAGCCTTCAAAGC
CCTCAGTAAGTTGCAATACTTAATTTCTGTAACAGCTAAGGACTGCAAAA
CCCCACTCTGCATCAACTGAACGCAAATCAGCCACTTTAATTAAGCTAAG
CCCTTACTAGACCAATGGGACTTAAACCCACAAACACTTAGTTAACAGCT
AAGCACCCTAATCAACTGGCTTCAATCTACTTCTCCCGCCGCCGGGAAAA
AAGGCGGGAGAAGCCCCGGCAGGTTTGAAGCTGCTTCTTCGAATTTGCAA
TTCAATATGAAAATCACCTCGGAGCTGGTAAAAAGAGGCCTAACCCCTGT
CTTTAGATTTACAGTCCAATGCTTCACTCAGCCATTTTACCTCACCCCCA
CTGATGTTCGCCGACCGTTGACTATTCTCTACAAACCACAAAGACATTGG
AACACTATACCTATTATTCGGCGCATGAGCTGGAGTCCTAGGCACAGCTC
TAAGCCTCCTTATTCGAGCCGAGCTGGGCCAGCCAGGCAACCTTCTAGGT
AACGACCACATCTACAACGTTATCGTCACAGCCCATGCATTTGTAATAAT
CTTCTTCATAGTAATACCCATCATAATCGGAGGCTTTGGCAACTGACTAG
TTCCCCTAATAATCGGTGCCCCCGATATGGCGTTTCCCCGCATAAACAAC
ATAAGCTTCTGACTCTTACCTCCCTCTCTCCTACTCCTGCTCGCATCTGC
TATAGTGGAGGCCGGAGCAGGAACAGGTTGAACAGTCTACCCTCCCTTAG
CAGGGAACTACTCCCACCCTGGAGCCTCCGTAGACCTAACCATCTTCTCC
TTACACCTAGCAGGTGTCTCCTCTATCTTAGGGGCCATCAATTTCATCAC
AACAATTATCAATATAAAACCCCCTGCCATAACCCAATACCAAACGCCCC
TCTTCGTCTGATCCGTCCTAATCACAGCAGTCCTACTTCTCCTATCTCTC
CCAGTCCTAGCTGCTGGCATCACTATACTACTAACAGACCGCAACCTCAA
CACCACCTTCTTCGACCCCGCCGGAGGAGGAGACCCCATTCTATACCAAC
ACCTATTCTGATTTTTCGGTCACCCTGAAGTTTATATTCTTATCCTACCA
GGCTTCGGAATAATCTCCCATATTGTAACTTACTACTCCGGAAAAAAAGA
ACCATTTGGATACATAGGTATGGTCTGAGCTATGATATCAATTGGCTTCC
TAGGGTTTATCGTGTGAGCACACCATATATTTACAGTAGGAATAGACGTA
GACACACGAGCATATTTCACCTCCGCTACCATAATCATCGCTATCCCCAC
CGGCGTCAAAGTATTTAGCTGACTCGCCACACTCCACGGAAGCAATATGA
AATGATCTGCTGCAGTGCTCTGAGCCCTAGGATTCATCTTTCTTTTCACC
GTAGGTGGCCTGACTGGCATTGTATTAGCAAACTCATCACTAGACATCGT
ACTACACGACACGTACTACGTTGTAGCTCACTTCCACTATGTCCTATCAA
TAGGAGCTGTATTTGCCATCATAGGAGGCTTCATTCACTGATTTCCCCTA
TTCTCAGGCTACACCCTAGACCAAACCTACGCCAAAATCCATTTCGCTAT
CATATTCATCGGCGTAAATCTAACTTTCTTCCCACAACACTTTCTCGGCC
TATCCGGAATGCCCCGACGTTACTCGGACTACCCCGATGCATACACCACA
TGAAATATCCTATCATCTGTAGGCTCATTCATTTCTCTAACAGCAGTAAT
ATTAATAATTTTCATGATTTGAGAAGCCTTCGCTTCGAAGCGAAAAGTCC
TAATAGTAGAAGAACCCTCCATAAACCTGGAGTGACTATATGGATGCCCC
CCACCCTACCACACATTCGAAGAACCCGTATACATAAAATCTAGACAAAA
AAGGAAGGAATCGAACCCCCCAAAGCTGGTTTCAAGCCAACCCCATGGCC
TCCATGACTTTTTCAAAAAGATATTAGAAAAACCATTTCATAACTTTGTC
AAAGTTAAATTATAGGCTAAATCCTATATATCTTAATGGCACATGCAGCG
CAAGTAGGTCTACAAGACGCTACTTCCCCTATCATAGAAGAGCTTATCAC
CTTTCATGATCACGCCCTCATAATCATTTTCCTTATCTGCTTCCTAGTCC
TGTATGCCCTTTTCCTAACACTCACAACAAAACTAACTAATACTAACATC
TCAGACGCTCAGGAAATAGAAACCGTCTGAACTATCCTGCCCGCCATCAT
CCTAGTCCTCATCGCCCTCCCATCCCTACGCATCCTTTACATAACAGACG
AGGTCAACGATCCCTCCCTTACCATCAAATCAATTGGCCACCAATGGTAC
TGAACCTACGAGTACACCGACTACGGCGGACTAATCTTCAACTCCTACAT
ACTTCCCCCATTATTCCTAGAACCAGGCGACCTGCGACTCCTTGACGTTG
ACAATCGAGTAGTACTCCCGATTGAAGCCCCCATTCGTATAATAATTACA
TCACAAGACGTCTTGCACTCATGAGCTGTCCCCACATTAGGCTTAAAAAC
AGATGCAATTCCCGGACGTCTAAACCAAACCACTTTCACCGCTACACGAC
CGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAGT
TTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAGG
GCCCGTATTTACCCTATAGCACCCCCTCTACCCCCTCTAGAGCCCACTGT
AAAGCTAACTTAGCATTAACCTTTTAAGTTAAAGATTAAGAGAACCAACA
CCTCTTTACAGTGAAATGCCCCAACTAAATACTACCGTATGGCCCACCAT
AATTACCCCCATACTCCTTACACTATTCCTCATCACCCAACTAAAAATAT
TAAACACAAACTACCACTTACCTCCCTCACCAAAGCCCATAAAAATAAAA
AATTATAACAAACCCTGAGAACCAAAATGAACGAAAATCTGTTCGCTTCA
TTCATTGCCCCCACAATCCTAGGCCTACCCGCCGCAGTACTGATCATTCT
ATTTCCCCCTCTATTGATCCCCACCTCCAAATATCTCATCAACAACCGAC
TAATTACCACCCAACAATGACTAATCAAACTAACCTCAAAACAAATGATA
GCCATACACAACACTAAAGGACGAACCTGATCTCTTATACTAGTATCCTT
AATCATTTTTATTGCCACAACTAACCTCCTCGGACTCCTGCCTCACTCAT
TTACACCAACCACCCAACTATCTATAAACCTAGCCATGGCCATCCCCTTA
TGAGCGGGCGCAGTGATTATAGGCTTTCGCTCTAAGATTAAAAATGCCCT
AGCCCACTTCTTACCACAAGGCACACCTACACCCCTTATCCCCATACTAG
TTATTATCGAAACCATCAGCCTACTCATTCAACCAATAGCCCTGGCCGTA
CGCCTAACCGCTAACATTACTGCAGGCCACCTACTCATGCACCTAATTGG
AAGCGCCACCCTAGCAATATCAACCATTAACCTTCCCTCTACACTTATCA
TCTTCACAATTCTAATTCTACTGACTATCCTAGAAATCGCTGTCGCCTTA
ATCCAAGCCTACGTTTTCACACTTCTAGTAAGCCTCTACCTGCACGACAA
CACATAATGACCCACCAATCACATGCCTATCATATAGTAAAACCCAGCCC
ATGACCCCTAACAGGGGCCCTCTCAGCCCTCCTAATGACCTCCGGCCTAG
CCATGTGATTTCACTTCCACTCCATAACGCTCCTCATACTAGGCCTACTA
ACCAACACACTAACCATATACCAATGATGGCGCGATGTAACACGAGAAAG
CACATACCAAGGCCACCACACACCACCTGTCCAAAAAGGCCTTCGATACG
GGATAATCCTATTTATTACCTCAGAAGTTTTTTTCTTCGCAGGATTTTTC
TGAGCCTTTTACCACTCCAGCCTAGCCCCTACCCCCCAACTAGGAGGGCA
CTGGCCCCCAACAGGCATCACCCCGCTAAATCCCCTAGAAGTCCCACTCC
TAAACACATCCGTATTACTCGCATCAGGAGTATCAATCACCTGAGCTCAC
CATAGTCTAATAGAAAACAACCGAAACCAAATAATTCAAGCACTGCTTAT
TACAATTTTACTGGGTCTCTATTTTACCCTCCTACAAGCCTCAGAGTACT
TCGAGTCTCCCTTCACCATTTCCGACGGCATCTACGGCTCAACATTTTTT
GTAGCCACAGGCTTCCACGGACTTCACGTCATTATTGGCTCAACTTTCCT
CACTATCTGCTTCATCCGCCAACTAATATTTCACTTTACATCCAAACATC
ACTTTGGCTTCGAAGCCGCCGCCTGATACTGGCATTTTGTAGATGTGGTT
TGACTATTTCTGTATGTCTCCATCTATTGATGAGGGTCTTACTCTTTTAG
TATAAATAGTACCGTTAACTTCCAATTAACTAGTTTTGACAACATTCAAA
AAAGAGTAATAAACTTCGCCTTAATTTTAATAATCAACACCCTCCTAGCC
TTACTACTAATAATTATTACATTTTGACTACCACAACTCAACGGCTACAT
AGAAAAATCCACCCCTTACGAGTGCGGCTTCGACCCTATATCCCCCGCCC
GCGTCCCTTTCTCCATAAAATTCTTCTTAGTAGCTATTACCTTCTTATTA
TTTGATCTAGAAATTGCCCTCCTTTTACCCCTACCATGAGCCCTACAAAC
AACTAACCTGCCACTAATAGTTATGTCATCCCTCTTATTAATCATCATCC
TAGCCCTAAGTCTGGCCTATGAGTGACTACAAAAAGGATTAGACTGAGCC
GAATTGGTATATAGTTTAAACAAAACGAATGATTTCGACTCATTAAATTA
TGATAATCATATTTACCAAATGCCCCTCATTTACATAAATATTATACTAG
CATTTACCATCTCACTTCTAGGAATACTAGTATATCGCTCACACCTCATA
TCCTCCCTACTATGCCTAGAAGGAATAATACTATCGCTGTTCATTATAGC
TACTCTCATAACCCTCAACACCCACTCCCTCTTAGCCAATATTGTGCCTA
TTGCCATACTAGTTTTTGCCGCCTGCGAAGCAGCGGTAGGCCTAGCCCTA
CTAGTCTCAATCTCCAACACATATGGCCTAGACTACGTACATAACCTAAA
CCTACTCCAATGCTAAAACTAATCGTCCCAACAATTATATTACTACCACT
GACATGACTCTCCAAAAAACACATAATTTGAATCAACACAACCACCCACA
GCCTAATTATTAGCATCATCCCCCTACTATTTTTTAACCAAATCAACAAC
AACCTATTTAGCTGCTCCCCAACCTTTTCCTCCGACCCCCTAACAACCCC
CCTCCTAATACTAACTACCTGACTCCTACCCCTCACAATCATGGCAAGCC
AACGCCACTTATCCAGTGAACCACTATCACGAAAAAAACTCTACCTCTCT
ATACTAATCTCCCTACAAATCTCCTTAATTATAACATTCACAGCCACAGA
ACTAATCATATTTTATATCTTCTTCGAAACCACACTTATCCCCACCTTGG
CTATCATCACCCGATGAGGCAACCAGCCAGAACGCCTGAACGCAGGCACA
TACTTCCTATTCTACACCCTAGTAGGCTCCCTTCCCCTACTCATCGCACT
AATTTACACTCACAACACCCTAGGCTCACTAAACATTCTACTACTCACTC
TCACTGCCCAAGAACTATCAAACTCCTGAGCCAACAACTTAATATGACTA
GCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTT
ATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTAC
TTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATACGCCTCACA
CTCATTCTCAACCCCCTGACAAAACACATAGCCTACCCCTTCCTTGTACT
ATCCCTATGAGGCATAATTATAACAAGCTCCATCTGCCTACGACAAACAG
ACCTAAAATCGCTCATTGCATACTCTTCAATCAGCCACATAGCCCTCGTA
GTAACAGCCATTCTCATCCAAACCCCCTGAAGCTTCACCGGCGCAGTCAT
TCTCATAATCGCCCACGGACTTACATCCTCATTACTATTCTGCCTAGCAA
ACTCAAACTACGAACGCACTCACAGTCGCATCATAATCCTCTCTCAAGGA
CTTCAAACTCTACTCCCACTAATAGCTTTTTGATGACTTCTAGCAAGCCT
CGCTAACCTCGCCTTACCCCCCACTATTAACCTACTGGGAGAACTCTCTG
TGCTAGTAACCACATTCTCCTGATCAAATATCACTCTCCTACTTACAGGA
CTCAACATACTAGTCACAGCCCTATACTCCCTCTACATATTTACCACAAC
ACAATGGGGCTCACTCACCCACCACATTAACAACATAAAACCCTCATTCA
CACGAGAAAACACCCTCATGTTCATACACCTATCCCCCATTCTCCTCCTA
TCCCTCAACCCCGACATCATTACCGGGTTTTCCTCTTGTAAATATAGTTT
AACCAAAACATCAGATTGTGAATCTGACAACAGAGGCTTACGACCCCTTA
TTTACCGAGAAAGCTCACAAGAACTGCTAACTCATGCCCCCATGTCTAAC
AACATGGCTTTCTCAACTTTTAAAGGATAACAGCTATCCATTGGTCTTAG
GCCCCAAAAATTTTGGTGCAACTCCAAATAAAAGTAATAACCATGCACAC
TACTATAACCACCCTAACCCTGACTTCCCTAATTCCCCCCATCCTTACCA
CCCTCGTTAACCCTAACAAAAAAAACTCATACCCCCATTATGTAAAATCC
ATTGTCGCATCCACCTTTATTATCAGTCTCTTCCCCACAACAATATTCAT
GTGCCTAGACCAAGAAGTTATTATCTCGAACTGACACTGAGCCACAACCC
AAACAACCCAGCTCTCCCTAAGCTTCAAACTAGACTACTTCTCCATAATA
TTCATCCCTGTAGCATTGTTCGTTACATGGTCCATCATAGAATTCTCACT
GTGATATATAAACTCAGACCCAAACATTAATCAGTTCTTCAAATATCTAC
TCATTTTCCTAATTACCATACTAATCTTAGTTACCGCTAACAACCTATTC
CAACTGTTCATCGGCTGAGAGGGCGTAGGAATTATATCCTTCTTGCTCAT
CAGTTGATGATACGCCCGAGCAGATGCCAACACAGCAGCCATTCAAGCAA
TCCTATACAACCGTATCGGCGATATCGGTTTCATCCTCGCCTTAGCATGA
TTTATCCTACACTCCAACTCATGAGACCCACAACAAATAGCCCTTCTAAA
CGCTAATCCAAGCCTCACCCCACTACTAGGCCTCCTCCTAGCAGCAGCAG
GCAAATCAGCCCAATTAGGTCTCCACCCCTGACTCCCCTCAGCCATAGAA
GGCCCCACCCCAGTCTCAGCCCTACTCCACTCAAGCACTATAGTTGTAGC
AGGAGTCTTCTTACTCATCCGCTTCCACCCCCTAGCAGAAAATAGCCCAC
TAATCCAAACTCTAACACTATGCTTAGGCGCTATCACCACTCTGTTCGCA
GCAGTCTGCGCCCTTACACAAAATGACATCAAAAAAATCGTAGCCTTCTC
CACTTCAAGTCAACTAGGACTCATAGTAGTTACAATCGGCATCAACCAAC
CACACCTAGCATTCCTGCACATCTGTACCCACGCCTTCTTCAAAGCCATA
CTATTTATGTGCTCCGGGTCCATCATCCACAACCTTAACAATGAACAAGA
TATTCGAAAAATAGGAGGACTACTCAAAACCATACCTCTCACTTCAACCT
CCCTCACCATTGGCAGCCTAGCATTAGCAGGAATACCTTTCCTCACAGGT
TTCTATTCCAAAGACCACATCATCGAAACCGCAAACATATCATACACAAA
CGCCTGAGCCCTATCTATTACTCTCATCGCTACCTCCCTGACAAGCGCCT
ATAGCACTCGAATAATTCTTCTCACCCTAACAGGTCAACCTCGCTTCCCT
ACCCTTACTAACATTAACGAAAATAACCCCACCCTACTAAACCCCATTAA
ACGCCTGGCAGCCGGAAGCCTATTCGCAGGATTTCTCATTACTAACAACA
TTTCCCCCGCATCCCCCTTCCAAACAACAATCCCCCTCTACCTAAAACTC
ACAGCCCTCGCTGTCACTTTCCTAGGACTTCTAACAGCCCTAGACCTCAA
CTACCTAACCAACAAACTTAAAATAAAATCCCCACTATGCACATTTTATT
TCTCCAACATACTCGGATTCTACCCTAGCATCACACACCGCACAATCCCC
TATCTAGGCCTTCTTACGAGCCAAAACCTGCCCCTACTCCTCCTAGACCT
AACCTGACTAGAAAAGCTATTACCTAAAACAATTTCACAGCACCAAATCT
CCACCTCCATCATCACCTCAACCCAAAAAGGCATAATTAAACTTTACTTC
CTCTCTTTCTTCTTCCCACTCATCCTAACCCTACTCCTAATCACATAACC
TATTCCCCCGAGCAATCTCAATTACAATATATACACCAACAAACAATGTT
CAACCAGTAACTACTACTAATCAACGCCCATAATCATACAAAGCCCCCGC
ACCAATAGGATCCTCCCGAATCAACCCTGACCCCTCTCCTTCATAAATTA
TTCAGCTTCCTACACTATTAAAGTTTACCACAACCACCACCCCATCATAC
TCTTTCACCCACAGCACCAATCCTACCTCCATCGCTAACCCCACTAAAAC
ACTCACCAAGACCTCAACCCCTGACCCCCATGCCTCAGGATACTCCTCAA
TAGCCATCGCTGTAGTATATCCAAAGACAACCATCATTCCCCCTAAATAA
ATTAAAAAAACTATTAAACCCATATAACCTCCCCCAAAATTCAGAATAAT
AACACACCCGACCACACCGCTAACAATCAATACTAAACCCCCATAAATAG
GAGAAGGCTTAGAAGAAAACCCCACAAACCCCATTACTAAACCCACACTC
AACAGAAACAAAGCATACATCATTATTCTCGCACGGACTACAACCACGAC
CAATGATATGAAAAACCATCGTTGTATTTCAACTACAAGAACACCAATGA
CCCCAATACGCAAAATTAACCCCCTAATAAAATTAATTAACCACTCATTC
ATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAAACTTCGGCTC
ACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAG
CCATGCACTACTCACCAGACGCCTCAACCGCCTTTTCATCAATCGCCCAC
ATCACTCGAGACGTAAATTATGGCTGAATCATCCGCTACCTTCACGCCAA
TGGCGCCTCAATATTCTTTATCTGCCTCTTCCTACACATCGGGCGAGGCC
TATATTACGGATCATTTCTCTACTCAGAAACCTGAAACATCGGCATTATC
CTCCTGCTTGCAACTATAGCAACAGCCTTCATAGGCTATGTCCTCCCGTG
AGGCCAAATATCATTCTGAGGGGCCACAGTAATTACAAACTTACTATCCG
CCATCCCATACATTGGGACAGACCTAGTTCAATGAATCTGAGGAGGCTAC
TCAGTAGACAGTCCCACCCTCACACGATTCTTTACCTTTCACTTCATCTT
GCCCTTCATTATTGCAGCCCTAGCAGCACTCCACCTCCTATTCTTGCACG
AAACGGGATCAAACAACCCCCTAGGAATCACCTCCCATTCCGATAAAATC
ACCTTCCACCCTTACTACACAATCAAAGACGCCCTCGGCTTACTTCTCTT
CCTTCTCTCCTTAATGACATTAACACTATTCTCACCAGACCTCCTAGGCG
ACCCAGACAATTATACCCTAGCCAACCCCTTAAACACCCCTCCCCACATC
AAGCCCGAATGATATTTCCTATTCGCCTACACAATTCTCCGATCCGTCCC
TAACAAACTAGGAGGCGTCCTTGCCCTATTACTATCCATCCTCATCCTAG
CAATAATCCCCATCCTCCATATATCCAAACAACAAAGCATAATATTTCGC
CCACTAAGCCAATCACTTTATTGACTCCTAGCCGCAGACCTCCTCATTCT
AACCTGAATCGGAGGACAACCAGTAAGCTACCCTTTTACCATCATTGGAC
AAGTAGCATCCGTACTATACTTCACAACAATCCTAATCCTAATACCAACT
ATCTCCCTAATTGAAAACAAAATACTCAAATGGGCCTGTCCTTGTAGTAT
AAACTAATACACCAGTCTTGTAAACCGGAGATGAAAACCTTTTTCCAAGG
ACAAATCAGAGAAAAAGTCTTTAACTCCACCATTAGCACCCAAAGCTAAG
ATTCTAATTTAAACTATTCTCTGTTCTTTCATGGGGAAGCAGATTTGGGT
ACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACA
TTACTGCCAGCCACCATGAATATTGTACAGTACCATAAATACTTGACCAC
CTGTAGTACATAAAAACCCAATCCACATCAAAACCCTCCCCCCATGCTTA
CAAGCAAGTACAGCAATCAACCTTCAACTGTCACACATCAACTGCAACTC
CAAAGCCACCCCTCACCCACTAGGATATCAACAAACCTACCCACCCTTAA
CAGTACATAGCACATAAAGCCATTTACCGTACATAGCACATTACAGTCAA
ATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGAC
CACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCG
CTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACAT
CTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCT
TAAATAAGACATCACGATG""".replace("\n","")


def unique_ids(names):
    """Return *names* stripped of whitespace, without blanks or duplicates.

    The order of first appearance is preserved so that the final report
    lists ids in the order the user asked for them.
    """
    seen = set()
    ordered = []
    for name in names:
        name = name.strip()
        if name and name not in seen:
            seen.add(name)
            ordered.append(name)
    return ordered


def read_id_file(path):
    """Return the lines of the id file *path* (one id per line) as a list."""
    # 'with' guarantees the handle is closed even if reading fails.
    with open(path) as handle:
        return [line.strip() for line in handle]


def expand_sequence(packed, reference):
    """Rebuild a full sequence from its packed form.

    Every 'Q' in *packed* means "same as the reference at this position" and
    is replaced by the matching character of *reference*; any other character
    is kept as is.  A packed string shorter than the reference is padded with
    '-', and one longer than the reference is truncated, so the result always
    has len(reference) characters.
    """
    padded = packed + "-" * (len(reference) - len(packed))
    return "".join(
        [ref_base if base == "Q" else base
         for base, ref_base in zip(padded, reference)])


def wrap_sequence(sequence, width=60):
    """Return *sequence* broken into newline-separated rows of *width*."""
    return "\n".join(
        [sequence[i:i + width] for i in range(0, len(sequence), width)])


def scan_records(lines, reference, wanted, check, out):
    """Stream packed records and write the selected ones to *out*.

    lines     -- iterable of "id<TAB>packed_sequence" records; a leading '>'
                 in the id is dropped and blank lines are ignored
    reference -- reference sequence used to expand each packed sequence
    wanted    -- collection of ids to select; if empty, every record is
                 selected
    check     -- if true, write "found: ID" for each selected record instead
                 of its FASTA entry
    out       -- file-like object receiving the output

    Returns (n_scanned, n_dumped, found): the number of records read, the
    number of FASTA entries written, and the set of selected ids that were
    actually seen in the input.
    """
    found = set()
    n_scanned = 0
    n_dumped = 0
    for line in lines:
        line = line.strip()
        if not line:
            continue
        fields = line.split("\t")
        name = fields[0].replace(">", "")
        n_scanned += 1

        if wanted and name not in wanted:
            continue
        # Record the match *before* branching on check mode, so that ids
        # reported as "found" are never listed as missing afterwards.
        found.add(name)

        if check:
            out.write("found: %s\n" % name)
            continue

        # A record without a sequence column is treated as an empty packed
        # sequence, i.e. it expands to padding only.
        packed = fields[1] if len(fields) > 1 else ""
        out.write(">%s\n" % name)
        out.write("%s\n" % wrap_sequence(expand_sequence(packed, reference)))
        n_dumped += 1
    return n_scanned, n_dumped, found


def missing_ids(requested, found):
    """Return (position, id) pairs for the requested ids absent from *found*."""
    return [(pos, name) for pos, name in enumerate(requested)
            if name not in found]


def main():
    """Collect the requested ids, expand matching stdin records, and report."""
    requested = []
    if options.idset:
        requested.extend(options.idset.split(","))

    if options.idfile:
        # Diagnostics go to stderr so they never end up inside the FASTA
        # written to stdout.
        sys.stderr.write("# reading IDs from file\n")
        if not os.path.exists(options.idfile):
            sys.stderr.write(
                "Yikes: no id file: .%s. - quitting\n" % options.idfile)
            sys.exit(1)
        requested.extend(read_id_file(options.idfile))

    requested = unique_ids(requested)
    n_scanned, n_dumped, found = scan_records(
        sys.stdin, rsrs, set(requested), options.check, sys.stdout)

    # report
    sys.stderr.write(
        "# Number of ( ids_requested, scanned, outputted ) = (%i, %i, %i)\n"
        % (len(requested), n_scanned, n_dumped))
    absent = missing_ids(requested, found)
    if absent:
        sys.stderr.write(
            "# %i requested ids could not be found:\n" % len(absent))
        sys.stderr.write(
            "%s\n" % "\n".join(["%i: %s (missing)" % pair for pair in absent]))


if __name__ == "__main__":
    main()
                   
  
